Packages and files needed to run analysis. All notes on dataset construction can be found in the BS_data_construction.html file.
# Helper functions (BS_func.R) and dataset construction / loading (BS_load.R);
# see BS_data_construction.html for notes on how the data were built.
source("BS_func.R")
source("BS_load.R")
The null model dataset is based on 8,650 observations across 5 years.
The yield data available is at a county scale and the distribution of yields across space exhibits strong autocorrelation, where yields in neighboring counties are more alike than yields in distant counties. This spatial autocorrelation is accounted for using a standard Conditional Autoregressive dependency model based on adjacency for all counties in the conterminous US. In order to account for additional county-specific factors that contribute to yields, a county iid random effect term is also included, yielding a Besag-York-Mollié (BYM) spatial dependency model. A county adjacency matrix is created for each crop under investigation. For regional models a separate county adjacency matrix for each region-crop combination is created.
# rook structure
library(INLA)
library(spdep)
library(spdplyr)
# Subset the national county polygons to counties present in the null dataset
# and assign a sequential integer ID (INLA graphs are indexed 1..n).
county_sub <- county %>%
  filter(GEOID %in% unique(null$GEOID)) %>%
  arrange(GEOID)
county_sub$ID <- seq_len(nrow(county_sub@data))
# relational matrix: rook contiguity (shared edges only, no corner touches)
temp <- poly2nb(county_sub, queen = FALSE) # rook
# Binary (0/1) weights; zero.policy keeps counties with no neighbors
h_adj <- nb2mat(temp, style = "B", zero.policy = TRUE)
h_adj <- as(h_adj, "dgTMatrix") # sparse style matrix conversion
saveRDS(h_adj, "./data/neighborhood.RDS")
# queen structure
library(INLA)
library(spdep)
library(spdplyr)
# Same subsetting as the rook chunk: counties present in the null dataset,
# ordered by GEOID, with a sequential integer ID for the INLA graph.
county_sub <- county %>%
  filter(GEOID %in% unique(null$GEOID)) %>%
  arrange(GEOID)
county_sub$ID <- seq_len(nrow(county_sub@data))
# relational matrix: queen contiguity (shared edges OR corners)
temp <- poly2nb(county_sub, queen = TRUE)
# Binary (0/1) weights; zero.policy keeps counties with no neighbors
h_adj <- nb2mat(temp, style = "B", zero.policy = TRUE)
h_adj <- as(h_adj, "dgTMatrix") # sparse style matrix conversion
saveRDS(h_adj, "./data/neighborhood_queen.RDS")
Load spatial structure, filter missing data, add unique ID for GEOID, round climate data to decrease computation time, build priors for all models:
# load adjacency matrix (rook contiguity, built in the chunk above)
hadj <- readRDS("./data/neighborhood.RDS") # rook
# other data prep in BS_load.R
Counties included in final null dataset:
# Mean yield per county across all years, merged onto the county polygons
# (all = TRUE keeps counties with no yield data as NA) and mapped.
y <- null %>%
  group_by(GEOID) %>%
  summarize(mnYIELD = mean(YIELD, na.rm = TRUE))
cty_check <- merge(county_sub, y, by = "GEOID", all = TRUE)
spplot(cty_check, "mnYIELD")
Number of counties with available data in each of the LRR groups:
library(RColorBrewer)
# Number of unique counties with data in each Land Resource Region (LRR)
r <- null %>%
  group_by(LRR) %>%
  summarize(n = n_distinct(GEOID))
# Join the counts onto the LRR polygons for mapping
reg_plot <- merge(as(lrr_shp, "Spatial"), r, by.x = "ID", by.y = "LRR")
my.palette <- brewer.pal(n = 9, name = "OrRd")
spplot(reg_plot, "n", col.regions = my.palette, cuts = 8, col = "gray")
library(knitr)
kable(reg_plot)
| ID | NAME | n |
|---|---|---|
| 2 | Atlantic and Gulf Coast Lowland Forest and Crop Region | 79 |
| 3 | California Subtropical Fruit, Truck, and Specialty Crop Region | 15 |
| 5 | Central Feed Grains and Livestock Region | 513 |
| 6 | Central Great Plains Winter Wheat and Range Region | 155 |
| 7 | East and Central Farming and Forest Region | 261 |
| 8 | Florida Subtropical Fruit, Truck Crop, and Range Region | NA |
| 12 | Lake State Fruit, Truck Crop, and Dairy Region | 78 |
| 13 | Mississippi Delta Cotton and Feed Grains Region | 47 |
| 14 | Northeastern Forage and Forest Region | 59 |
| 16 | Northern Atlantic Slope Diversified Farming Region | 67 |
| 17 | Northern Great Plains Spring Wheat Region | 90 |
| 18 | Northern Lake States Forest and Forage Region | 85 |
| 19 | Northwestern Forest, Forage, and Specialty Crop Region | NA |
| 20 | Northwestern Wheat and Range Region | 16 |
| 22 | Rocky Mountain Range and Forest Region | 11 |
| 23 | South Atlantic and Gulf Slope Cash Crops, Forest, and Livestock Region | 289 |
| 25 | Southwest Plateaus and Plains Range and Cotton Region | 20 |
| 26 | Southwestern Prairies Cotton and Forage Region | 35 |
| 28 | Western Great Plains Range and Irrigated Region | 58 |
| 29 | Western Range and Irrigated Region | 18 |
This got quite long, so all of the model development and sensitivity testing is well-documented in the Sensitivity_testing document. This doc also has full descriptions of the models to help with writing up methods. Results are presented for:
State-level results performed worse than these models, but are in the Archive folder just in case (null_STATE.RDS with code in BS_analysis_June19.RMD).
# Define priors
apar <- 0.5
# OLS fit used only to scale the priors from its residual variance
lmod <- lm(YIELD ~ GDD + SDD + EFP + EXP + CORN_SUIT + factor(YEAR), null)
bpar <- apar * var(residuals(lmod))
# Log-gamma prior on the precision of the regional iid random effect
lgprior_iid <- list(prec = list(prior = "loggamma", param = c(apar, bpar)))
sdres <- sd(residuals(lmod))
# PC prior on the BYM2 precision: P(sd > 3 * sdres) = 0.01
lgprior_bym <- list(prec = list(prior = "pc.prec", param = c(3 * sdres, 0.01)))
u <- 0.6 # PC-prior scale for the RW1 smoothness hyperparameters
# Specify formula
formula <- YIELD ~ 1 + f(ID, model = "bym2", # county-level spatial structure
                         graph = hadj,
                         scale.model = TRUE,
                         constr = TRUE,
                         hyper = lgprior_bym) +
  CORN_SUIT +
  f(LRR, model = "iid", hyper = lgprior_iid) + # regional random intercept
  # Non-linear (first-order random walk) climate effects
  f(GDD, model = "rw1", scale.model = TRUE, constr = TRUE,
    hyper = list(theta = list(prior = "pc.prec", param = c(u, 0.01)))) +
  f(SDD, model = "rw1", scale.model = TRUE, constr = TRUE,
    hyper = list(theta = list(prior = "pc.prec", param = c(u, 0.01)))) +
  f(TP, model = "rw1", scale.model = TRUE, constr = TRUE,
    hyper = list(theta = list(prior = "pc.prec", param = c(u, 0.01)))) +
  factor(YEAR)
# Fit the model and request DIC/CPO for model comparison
modr <- inla(formula, data = null, family = "gaussian",
             control.predictor = list(compute = TRUE),
             control.compute = list(dic = TRUE, cpo = TRUE))
# d <- model_diagnostics(modr, null)
saveRDS(modr, "./out/null_LRR.RDS")
# Reload the cached fit and compute diagnostics
modr <- readRDS("./out/null_LRR.RDS")
modd <- model_diagnostics(modr, null)
Precision estimates are all small, fixed effects make intuitive sense. Deviance is reasonable as compared to other models.
summary(modr)
##
## Call:
## c("inla(formula = formula, family = \"gaussian\", data = null,
## control.compute = list(dic = T, ", " cpo = T), control.predictor =
## list(compute = T))")
## Time used:
## Pre = 50, Running = 729, Post = 2.98, Total = 782
## Fixed effects:
## mean sd 0.025quant 0.5quant 0.975quant mode kld
## (Intercept) 70.022 7.534 55.230 70.022 84.801 70.022 0
## CORN_SUIT 65.381 4.210 57.115 65.381 73.641 65.381 0
## factor(YEAR)2002 5.977 0.705 4.592 5.977 7.361 5.977 0
## factor(YEAR)2007 22.563 0.714 21.161 22.563 23.963 22.563 0
## factor(YEAR)2012 21.085 0.759 19.594 21.085 22.574 21.085 0
## factor(YEAR)2017 43.802 0.683 42.461 43.802 45.143 43.802 0
##
## Random effects:
## Name Model
## ID BYM2 model
## LRR IID model
## GDD RW1 model
## SDD RW1 model
## TP RW1 model
##
## Model hyperparameters:
## mean sd 0.025quant 0.5quant
## Precision for the Gaussian observations 0.004 0.000 0.004 0.004
## Precision for ID 0.004 0.000 0.003 0.004
## Phi for ID 0.129 0.016 0.103 0.128
## Precision for LRR 0.001 0.000 0.001 0.001
## Precision for GDD 0.032 0.010 0.014 0.032
## Precision for SDD 0.017 0.002 0.013 0.017
## Precision for TP 0.126 0.031 0.082 0.121
## 0.975quant mode
## Precision for the Gaussian observations 0.004 0.004
## Precision for ID 0.004 0.004
## Phi for ID 0.164 0.123
## Precision for LRR 0.002 0.001
## Precision for GDD 0.052 0.031
## Precision for SDD 0.022 0.016
## Precision for TP 0.200 0.110
##
## Expected number of effective parameters(stdev): 1607.18(0.00)
## Number of equivalent replicates : 5.38
##
## Deviance Information Criterion (DIC) ...............: 77583.77
## Deviance Information Criterion (DIC, saturated) ....: 13517.31
## Effective number of parameters .....................: 1548.70
##
## Marginal log-Likelihood: -40655.54
## CPO and PIT are computed
##
## Posterior marginals for the linear predictor and
## the fitted values are computed
Assess non-linearities:
# Posterior non-linear (RW1) effect curves for each climate covariate
sdd <- nonlinear_effect(modr, "SDD")
gdd <- nonlinear_effect(modr, "GDD")
tp <- nonlinear_effect(modr, "TP")
sdd
gdd
tp
GDD and SDD reflect known and well-documented relationships. TP response curve is odd. How about area-specific residuals (ui). These are the modr$summary.random$ID[1:n, c(1, 2, 3)] values.
modcty <- spatial_effects(modr, null, level="ID")
modcty
Now for the spatially-structured residuals only:
modres <- spatial_residuals(modr, null, level="ID")
modres
How about regional (LRR) effects:
modlrr <- spatial_effects(modr, null, level="LRR")
modlrr
Look at residuals:
# Standardized residuals: observed yield minus the posterior mean fitted
# value, scaled by the posterior sd of the fitted value
res.lin <- (null$YIELD - modr$summary.fitted.values$mean)/modr$summary.fitted.values$sd # standardized residuals of predicted mu-hat values
hist(res.lin, 100)
How about general model diagnostics:
modd$DIC
## [1] 77583.78
modd$CPO
## [1] -39281
modd$MSE
## [1] 315.7055
modd$R2
## [1] 0.717921
The probability integral transform (PIT) is another leave-one-out cross-validation check that evaluates the predictive performance of the model, where a Uniform distribution of PIT values means the predictive distribution matches the data well. In our case, the PIT distribution tends towards uniform, with some extreme values in both directions (consider trimming or inspecting these outliers).
modd$PIT
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The first PPC shows a scatterplot of the posterior means for the predictive distribution and observed values, where points scattered along a line of slope 1 indicates a very good fit.
modd$PPC_PT
The second PPC provides a histogram of the posterior predictive p-value which estimates the probability of predicting a value more extreme than the observed value, where values near 0 and 1 indicate that the model does not adequately account for the tails of the observed distribution. The tails are due to isolated county-year events which seem to correspond with disaster records.
modd$PPC_HIST
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Finally, compute the proportion of variance explained by the structured spatial component:
# p.186, only if scale.model=T
marg.hyper <- inla.hyperpar.sample(100000, modr)
# Mean share of county-level variance attributed to the structured spatial
# component. NOTE(review): assumes columns 1 and 2 of the hyperparameter
# sample hold the relevant precisions -- verify against modr$summary.hyperpar
pv <- mean(marg.hyper[,1]/(marg.hyper[,1]+marg.hyper[,2]))
The proportion of spatial variance is 0.5185398, suggesting that a significant amount of the variability is explained by spatial structure.
# Identify bright/dark spots: counties whose effect differs from their LRR
# regional effect by more than th standard deviations
lrr <- find_bs(modr, null, level = "LRR", th=2)
mdp <- brewer.pal(n = 9, name = "BrBG")
# Overlay the LRR region boundaries on top of the county-level map
lrr.layer <- list("sp.polygons", as(lrr_shp, "Spatial"), col = "gray", first=F)
spplot(lrr, "DIFF", main = "Difference between regional and county effects", col.regions = mdp, cuts = 8, col = "transparent", sp.layout = lrr.layer)
plot_bsds(lrr)
# Repeat with a less conservative threshold (1.5 sd)
lrr <- find_bs(modr, null, level = "LRR", th=1.5)
plot_bsds(lrr)
# Define priors (constructed exactly as in the LRR model above)
apar <- 0.5
# OLS fit used only to scale the priors from its residual variance
lmod <- lm(YIELD ~ GDD + SDD + EFP + EXP + CORN_SUIT + factor(YEAR), null)
bpar <- apar*var(residuals(lmod))
# Log-gamma prior on the precision of the regional iid term
lgprior_iid <- list(prec = list(prior="loggamma", param = c(apar,bpar)))
sdres <- sd(residuals(lmod))
# PC prior on the BYM2 precision: P(sd > 3*sdres) = 0.01
lgprior_bym <- list(prec = list(prior="pc.prec", param = c(3*sdres, 0.01)))
u <- 0.6
# Specify formula: same structure as the LRR model, with an ecoregion (ECO)
# iid random effect in place of the LRR term
formula <- YIELD ~ 1 + f(ID, model="bym2", # county-level spatial structure
graph=hadj,
scale.model=TRUE,
constr = TRUE,
hyper= lgprior_bym) +
CORN_SUIT +
f(ECO, model = "iid", hyper = lgprior_iid) +
# Non-linear (RW1) climate effects
f(GDD, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
param = c(u, 0.01)))) +
f(SDD, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
param = c(u, 0.01)))) +
f(TP, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
param = c(u, 0.01)))) +
factor(YEAR)
# Fit and request DIC/CPO for model comparison
modr <- inla(formula, data=null, family="gaussian",
control.predictor = list(compute=T),
control.compute = list(dic=T, cpo=T))
# d <- model_diagnostics(modr, null)
saveRDS(modr, "./out/null_ECO.RDS")
# Reload the cached fit and compute diagnostics
modr <- readRDS("./out/null_ECO.RDS")
modd <- model_diagnostics(modr, null)
Precision estimates are all small, fixed effects make intuitive sense. Deviance is reasonable as compared to other models.
summary(modr)
##
## Call:
## c("inla(formula = formula, family = \"gaussian\", data = null,
## control.compute = list(dic = T, ", " cpo = T), control.predictor =
## list(compute = T))")
## Time used:
## Pre = 48.7, Running = 1935, Post = 5.23, Total = 1989
## Fixed effects:
## mean sd 0.025quant 0.5quant 0.975quant mode kld
## (Intercept) 76.212 8.583 59.361 76.203 93.087 76.187 0
## CORN_SUIT 58.918 4.935 49.229 58.918 68.600 58.918 0
## factor(YEAR)2002 5.360 0.853 3.686 5.360 7.033 5.360 0
## factor(YEAR)2007 22.762 0.853 21.086 22.762 24.436 22.762 0
## factor(YEAR)2012 20.235 0.929 18.411 20.235 22.057 20.235 0
## factor(YEAR)2017 44.617 0.815 43.017 44.617 46.215 44.617 0
##
## Random effects:
## Name Model
## ID BYM2 model
## ECO IID model
## GDD RW1 model
## SDD RW1 model
## TP RW1 model
##
## Model hyperparameters:
## mean sd 0.025quant 0.5quant
## Precision for the Gaussian observations 0.002 0.000 0.002 0.002
## Precision for ID 0.002 0.000 0.002 0.002
## Phi for ID 0.208 0.009 0.190 0.208
## Precision for ECO 0.001 0.000 0.001 0.001
## Precision for GDD 0.042 0.003 0.036 0.042
## Precision for SDD 0.020 0.001 0.018 0.020
## Precision for TP 0.137 0.015 0.108 0.136
## 0.975quant mode
## Precision for the Gaussian observations 0.003 0.002
## Precision for ID 0.002 0.002
## Phi for ID 0.229 0.207
## Precision for ECO 0.001 0.001
## Precision for GDD 0.049 0.042
## Precision for SDD 0.022 0.020
## Precision for TP 0.168 0.135
##
## Expected number of effective parameters(stdev): 1642.75(3.80)
## Number of equivalent replicates : 5.27
##
## Deviance Information Criterion (DIC) ...............: 77893.46
## Deviance Information Criterion (DIC, saturated) ....: 10131.68
## Effective number of parameters .....................: 1638.43
##
## Marginal log-Likelihood: -40510.33
## CPO and PIT are computed
##
## Posterior marginals for the linear predictor and
## the fitted values are computed
Assess non-linearities:
sdd <- nonlinear_effect(modr, "SDD")
gdd <- nonlinear_effect(modr, "GDD")
tp <- nonlinear_effect(modr, "TP")
sdd
gdd
tp
GDD and SDD reflect known and well-documented relationships. TP response curve is odd. How about area-specific residuals (ui). These are the modr$summary.random$ID[1:n, c(1, 2, 3)] values.
modcty <- spatial_effects(modr, null, level="ID")
modcty
Now for the spatially-structured residuals only:
modres <- spatial_residuals(modr, null, level="ID")
modres
How about regional (ECO) effects:
# Regional (ECO) effects from the ECO model. Print the object just created;
# the original echoed the stale `modlrr` object from the LRR section instead.
modeco <- spatial_effects(modr, null, level="ECO")
modeco
Look at residuals:
# Standardized residuals: observed yield minus the posterior mean fitted
# value, scaled by the posterior sd of the fitted value
res.lin <- (null$YIELD - modr$summary.fitted.values$mean)/modr$summary.fitted.values$sd # standardized residuals of predicted mu-hat values
hist(res.lin, 100)
How about general model diagnostics:
modd$DIC
## [1] 77893.46
modd$CPO
## [1] -39045.83
modd$MSE
## [1] 318.3804
modd$R2
## [1] 0.7055102
The probability integral transform (PIT) is another leave-one-out cross-validation check that evaluates the predictive performance of the model, where a Uniform distribution of PIT values means the predictive distribution matches the data well. In our case, the PIT distribution tends towards uniform, with some extreme values in both directions (consider trimming or inspecting these outliers).
modd$PIT
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The first PPC shows a scatterplot of the posterior means for the predictive distribution and observed values, where points scattered along a line of slope 1 indicates a very good fit.
modd$PPC_PT
The second PPC provides a histogram of the posterior predictive p-value which estimates the probability of predicting a value more extreme than the observed value, where values near 0 and 1 indicate that the model does not adequately account for the tails of the observed distribution. The tails are due to isolated county-year events which seem to correspond with disaster records.
modd$PPC_HIST
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Finally, compute the proportion of variance explained by the structured spatial component:
# p.186, only if scale.model=T
marg.hyper <- inla.hyperpar.sample(100000, modr)
pv <- mean(marg.hyper[,1]/(marg.hyper[,1]+marg.hyper[,2]))
The proportion of spatial variance is 0.5654147, suggesting that a significant amount of the variability is explained by spatial structure.
lrr <- find_bs(modr, null, level = "ECO", th=2)
mdp <- brewer.pal(n = 9, name = "BrBG")
lrr.layer <- list("sp.polygons", as(lrr_shp, "Spatial"), col = "gray", first=F)
spplot(lrr, "DIFF", main = "Difference between regional and county effects", col.regions = mdp, cuts = 8, col = "transparent", sp.layout = lrr.layer)
plot_bsds(lrr)
lrr <- find_bs(modr, null, level = "ECO", th=1.5)
plot_bsds(lrr)
# Define priors (constructed exactly as in the LRR model above)
apar <- 0.5
# OLS fit used only to scale the priors from its residual variance
lmod <- lm(YIELD ~ GDD + SDD + EFP + EXP + CORN_SUIT + factor(YEAR), null)
bpar <- apar*var(residuals(lmod))
# Log-gamma prior on the precision of the regional iid term
lgprior_iid <- list(prec = list(prior="loggamma", param = c(apar,bpar)))
sdres <- sd(residuals(lmod))
# PC prior on the BYM2 precision: P(sd > 3*sdres) = 0.01
lgprior_bym <- list(prec = list(prior="pc.prec", param = c(3*sdres, 0.01)))
u <- 0.6
# Specify formula: same structure as the LRR model, with a state-level (STATE)
# iid random effect in place of the LRR term
formula <- YIELD ~ 1 + f(ID, model="bym2", # county-level spatial structure
graph=hadj,
scale.model=TRUE,
constr = TRUE,
hyper= lgprior_bym) +
CORN_SUIT +
f(STATE, model = "iid", hyper = lgprior_iid) +
# Non-linear (RW1) climate effects
f(GDD, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
param = c(u, 0.01)))) +
f(SDD, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
param = c(u, 0.01)))) +
f(TP, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
param = c(u, 0.01)))) +
factor(YEAR)
# Fit and request DIC/CPO for model comparison
modr <- inla(formula, data=null, family="gaussian",
control.predictor = list(compute=T),
control.compute = list(dic=T, cpo=T))
# d <- model_diagnostics(modr, null)
saveRDS(modr, "./out/null_STATE.RDS")
# Reload the cached fit and compute diagnostics
modr <- readRDS("./out/null_STATE.RDS")
modd <- model_diagnostics(modr, null)
Precision estimates are all small, fixed effects make intuitive sense. Deviance is reasonable as compared to other models.
summary(modr)
##
## Call:
## c("inla(formula = formula, family = \"gaussian\", data = null,
## control.compute = list(dic = T, ", " cpo = T), control.predictor =
## list(compute = T))")
## Time used:
## Pre = 19.2, Running = 456, Post = 0.726, Total = 476
## Fixed effects:
## mean sd 0.025quant 0.5quant 0.975quant mode kld
## (Intercept) 73.736 6.510 60.897 73.743 86.523 73.764 0
## CORN_SUIT 40.968 4.360 32.408 40.967 49.523 40.966 0
## factor(YEAR)2002 5.926 0.829 4.297 5.926 7.552 5.926 0
## factor(YEAR)2007 21.715 0.892 19.960 21.716 23.463 21.717 0
## factor(YEAR)2012 20.321 0.915 18.522 20.322 22.115 20.324 0
## factor(YEAR)2017 44.166 0.814 42.569 44.166 45.763 44.166 0
##
## Random effects:
## Name Model
## ID BYM2 model
## STATE IID model
## GDD RW1 model
## SDD RW1 model
## TP RW1 model
##
## Model hyperparameters:
## mean sd 0.025quant 0.5quant
## Precision for the Gaussian observations 0.003 0.000 0.003 0.003
## Precision for ID 0.004 0.000 0.004 0.004
## Phi for ID 0.005 0.013 0.000 0.002
## Precision for STATE 0.001 0.000 0.000 0.001
## Precision for GDD 0.036 0.006 0.026 0.036
## Precision for SDD 0.010 0.001 0.008 0.009
## Precision for TP 0.098 0.023 0.065 0.094
## 0.975quant mode
## Precision for the Gaussian observations 0.003 0.003
## Precision for ID 0.004 0.004
## Phi for ID 0.033 0.000
## Precision for STATE 0.001 0.001
## Precision for GDD 0.048 0.037
## Precision for SDD 0.012 0.009
## Precision for TP 0.154 0.085
##
## Expected number of effective parameters(stdev): 1559.89(17.81)
## Number of equivalent replicates : 5.54
##
## Deviance Information Criterion (DIC) ...............: 77660.71
## Deviance Information Criterion (DIC, saturated) ....: 10902.83
## Effective number of parameters .....................: 1557.17
##
## Marginal log-Likelihood: -40373.39
## CPO and PIT are computed
##
## Posterior marginals for the linear predictor and
## the fitted values are computed
Assess non-linearities:
sdd <- nonlinear_effect(modr, "SDD")
gdd <- nonlinear_effect(modr, "GDD")
tp <- nonlinear_effect(modr, "TP")
sdd
gdd
tp
GDD and SDD reflect known and well-documented relationships. TP response curve is odd. How about area-specific residuals (ui). These are the modr$summary.random$ID[1:n, c(1, 2, 3)] values.
modcty <- spatial_effects(modr, null, level="ID")
modcty
Now for the spatially-structured residuals only:
modres <- spatial_residuals(modr, null, level="ID")
modres
Look at residuals:
# Standardized residuals: observed yield minus the posterior mean fitted
# value, scaled by the posterior sd of the fitted value
res.lin <- (null$YIELD - modr$summary.fitted.values$mean)/modr$summary.fitted.values$sd # standardized residuals of predicted mu-hat values
hist(res.lin, 100)
How about general model diagnostics:
modd$DIC
## [1] 77660.71
modd$CPO
## [1] -39018.84
modd$MSE
## [1] 322.1072
modd$R2
## [1] 0.7085329
The probability integral transform (PIT) is another leave-one-out cross-validation check that evaluates the predictive performance of the model, where a Uniform distribution of PIT values means the predictive distribution matches the data well. In our case, the PIT distribution tends towards uniform, with some extreme values in both directions (consider trimming or inspecting these outliers).
modd$PIT
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The first PPC shows a scatterplot of the posterior means for the predictive distribution and observed values, where points scattered along a line of slope 1 indicates a very good fit.
modd$PPC_PT
The second PPC provides a histogram of the posterior predictive p-value which estimates the probability of predicting a value more extreme than the observed value, where values near 0 and 1 indicate that the model does not adequately account for the tails of the observed distribution. The tails are due to isolated county-year events which seem to correspond with disaster records.
modd$PPC_HIST
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Finally, compute the proportion of variance explained by the structured spatial component:
# p.186, only if scale.model=T
marg.hyper <- inla.hyperpar.sample(100000, modr)
pv <- mean(marg.hyper[,1]/(marg.hyper[,1]+marg.hyper[,2]))
The proportion of spatial variance is 0.4227594, suggesting that a substantial amount of the variability is explained by spatial structure.
lrr <- find_bs(modr, null, level = "STATE", th=2)
mdp <- brewer.pal(n = 9, name = "BrBG")
lrr.layer <- list("sp.polygons", as(lrr_shp, "Spatial"), col = "gray", first=F)
spplot(lrr, "DIFF", main = "Difference between regional and county effects", col.regions = mdp, cuts = 8, col = "transparent", sp.layout = lrr.layer)
plot_bsds(lrr)
lrr <- find_bs(modr, null, level = "STATE", th=1.5)
plot_bsds(lrr)
Results here use county-effects from the LRR model described above:
# National-scale bright/dark spots: county BYM2 effects compared to the
# national mean, using the LRR model fit cached above
modr <- readRDS("./out/null_LRR.RDS")
lrr <- national_bs(modr, null, th=2)
mdp <- brewer.pal(n = 9, name = "BrBG")
# Overlay the LRR region boundaries for reference
lrr.layer <- list("sp.polygons", as(lrr_shp, "Spatial"), col = "gray", first=F)
spplot(lrr, "DIFF", main = "Difference between county effects and national mean", col.regions = mdp, cuts = 8, col = "transparent", sp.layout = lrr.layer)
plot_bsds(lrr)
# Repeat with a less conservative threshold (1.5 sd)
lrr <- national_bs(modr, null, th=1.5)
plot_bsds(lrr)
# Define priors
# NOTE(review): priors are scaled from an OLS fit on `nulli`, while the model
# below is fit on the merged `nulli1` -- confirm this is intentional.
apar <- 0.5
lmod <- lm(YIELD ~ GDD + SDD + EFP + EXP + CORN_SUIT + factor(YEAR), nulli)
bpar <- apar*var(residuals(lmod))
# Log-gamma prior on the precision of the regional iid term
lgprior_iid <- list(prec = list(prior="loggamma", param = c(apar,bpar)))
sdres <- sd(residuals(lmod))
# PC prior on the BYM2 precision: P(sd > 3*sdres) = 0.01
lgprior_bym <- list(prec = list(prior="pc.prec", param = c(3*sdres, 0.01)))
u <- 0.6
# Specify formula: LRR model extended with county-level socio-economic and
# landscape predictors entered as linear fixed effects
formula <- YIELD ~ 1 + f(ID, model="bym2", # county-level spatial structure
graph=hadj,
scale.model=TRUE,
constr = TRUE,
hyper= lgprior_bym) +
CORN_SUIT +
female +
tenant +
age +
labor_expense +
chem +
machinery +
irrig +
gvt_prog +
ph_corn +
cty_cl +
PERC_NATURAL_COVER +
SDI_NOMASK +
ED_NOMASK +
RICH_NOMASK +
LPI_NOMASK +
f(LRR, model = "iid", hyper = lgprior_iid) +
# Non-linear (RW1) climate effects
f(GDD, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
param = c(u, 0.01)))) +
f(SDD, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
param = c(u, 0.01)))) +
f(TP, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
param = c(u, 0.01)))) +
# RW1 (non-linear) versions of the predictors were tested and disabled:
# f(chem, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
# param = c(u, 0.01)))) +
# f(machinery, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
# param = c(u, 0.01)))) +
# f(irrig, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
# param = c(u, 0.01)))) +
# f(gvt_prog, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
# param = c(u, 0.01)))) +
# f(ph_corn, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
# param = c(u, 0.01)))) +
# f(cty_cl, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
# param = c(u, 0.01)))) +
# f(PERC_NATURAL_COVER, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
# param = c(u, 0.01)))) +
# f(RICH_NOMASK, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
# param = c(u, 0.01)))) +
# f(ED_NOMASK, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
# param = c(u, 0.01)))) +
# f(LPI_NOMASK, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
# param = c(u, 0.01)))) +
# f(SDI_NOMASK, model = "rw1", scale.model = T, constr = T, hyper = list(theta = list(prior = "pc.prec",
# param = c(u, 0.01)))) +
factor(YEAR)
# Rounding (to reduce the number of RW1 knots) was only needed for the
# disabled non-linear predictor terms above:
# nulli$chem<-round(nulli$chem,-1)
# nulli$machinery<-round(nulli$machinery,-1)
# nulli$irrig<-round(nulli$irrig,-1)
# nulli$gvt_prog<-round(nulli$gvt_prog,-1)
# nulli$ph_corn<-round(nulli$ph_corn,-1)
# nulli$cty_cl<-round(nulli$cty_cl,-1)
# nulli$PERC_NATURAL_COVER<-round(nulli$PERC_NATURAL_COVER,-1)
# nulli$RICH_NOMASK<-round(nulli$RICH_NOMASK,-1)
# nulli$ED_NOMASK<-round(nulli$ED_NOMASK,-1)
# nulli$LPI_NOMASK<-round(nulli$LPI_NOMASK,-1)
# nulli$SDI_NOMASK<-round(nulli$SDI_NOMASK,-1)
# Attach the county IDs; NOTE(review): all = T is a full join and may add
# rows for counties absent from nulli -- confirm this is intended.
nulli1 <- merge(nulli, county_sub@data, by = "GEOID", all = T)
modr <- inla(formula, data=nulli1, family="gaussian",
control.predictor = list(compute=T),
control.compute = list(dic=T, cpo=T))
# d <- model_diagnostics(modr, null)
saveRDS(modr, "./out/null_LRR_preds.RDS")
# Rebuild the merged data and reload the cached fit for diagnostics
nulli1 <- merge(nulli, county_sub@data, by = "GEOID", all = T)
modr <- readRDS("./out/null_LRR_preds.RDS")
modd <- model_diagnostics(modr, nulli1)
Precision estimates are all small, fixed effects make intuitive sense. Deviance is reasonable as compared to other models.
summary(modr)
##
## Call:
## c("inla(formula = formula, family = \"gaussian\", data = nulli,
## control.compute = list(dic = T, ", " cpo = T), control.predictor =
## list(compute = T))")
## Time used:
## Pre = 14.8, Running = 348, Post = 0.939, Total = 363
## Fixed effects:
## mean sd 0.025quant 0.5quant 0.975quant mode kld
## (Intercept) 24.169 13.097 -1.539 24.165 49.871 24.160 0
## CORN_SUIT 24.267 4.852 14.740 24.266 33.785 24.266 0
## female -0.106 0.084 -0.272 -0.106 0.059 -0.106 0
## tenant 0.504 0.072 0.362 0.504 0.645 0.504 0
## age 0.963 0.190 0.590 0.963 1.336 0.963 0
## labor_expense -0.011 0.007 -0.025 -0.011 0.002 -0.011 0
## chem 0.190 0.037 0.118 0.190 0.263 0.190 0
## machinery 0.006 0.003 -0.001 0.006 0.012 0.006 0
## irrig 0.595 0.047 0.503 0.595 0.687 0.595 0
## gvt_prog 0.098 0.061 -0.022 0.098 0.218 0.098 0
## ph_corn 0.335 0.031 0.273 0.335 0.397 0.335 0
## cty_cl 0.112 0.039 0.035 0.112 0.188 0.112 0
## PERC_NATURAL_COVER 4.289 4.268 -4.091 4.289 12.662 4.289 0
## SDI_NOMASK 6.526 2.130 2.345 6.526 10.704 6.526 0
## ED_NOMASK -0.134 0.018 -0.169 -0.134 -0.099 -0.134 0
## RICH_NOMASK -0.269 0.087 -0.440 -0.269 -0.098 -0.269 0
## LPI_NOMASK 0.122 0.041 0.041 0.122 0.203 0.122 0
## factor(YEAR)2002 2.224 1.071 0.120 2.224 4.326 2.224 0
## factor(YEAR)2007 14.746 1.318 12.159 14.746 17.331 14.746 0
## factor(YEAR)2012 6.854 1.641 3.633 6.854 10.072 6.854 0
## factor(YEAR)2017 35.790 2.335 31.207 35.790 40.370 35.790 0
##
## Random effects:
## Name Model
## ID BYM2 model
## LRR IID model
## GDD RW1 model
## SDD RW1 model
## TP RW1 model
##
## Model hyperparameters:
## mean sd 0.025quant 0.5quant
## Precision for the Gaussian observations 0.001 0.000 0.001 0.001
## Precision for ID 0.010 0.001 0.008 0.010
## Phi for ID 0.342 0.054 0.222 0.349
## Precision for LRR 0.002 0.001 0.001 0.002
## Precision for GDD 0.025 0.003 0.020 0.024
## Precision for SDD 0.018 0.001 0.017 0.018
## Precision for TP 0.439 0.140 0.183 0.437
## 0.975quant mode
## Precision for the Gaussian observations 0.001 0.001
## Precision for ID 0.011 0.010
## Phi for ID 0.428 0.386
## Precision for LRR 0.004 0.002
## Precision for GDD 0.032 0.023
## Precision for SDD 0.020 0.018
## Precision for TP 0.706 0.427
##
## Expected number of effective parameters(stdev): 806.91(2.72)
## Number of equivalent replicates : 11.29
##
## Deviance Information Criterion (DIC) ...............: 83855.69
## Deviance Information Criterion (DIC, saturated) ....: 6332.84
## Effective number of parameters .....................: 548.55
##
## Marginal log-Likelihood: -43005.56
## CPO and PIT are computed
##
## Posterior marginals for the linear predictor and
## the fitted values are computed
Assess non-linearities:
sdd <- nonlinear_effect(modr, "SDD")
gdd <- nonlinear_effect(modr, "GDD")
tp <- nonlinear_effect(modr, "TP")
sdd
gdd
tp
GDD and SDD reflect known and well-documented relationships. TP response curve is odd. How about area-specific residuals (ui). These are the modr$summary.random$ID[1:n, c(1, 2, 3)] values.
modcty <- spatial_effects(modr, nulli1, level="ID")
modcty
Now for the spatially-structured residuals only:
modres <- spatial_residuals(modr, nulli1, level="ID")
modres
How about regional (LRR) effects:
modlrr <- spatial_effects(modr, nulli1, level="LRR")
modlrr
Look at residuals:
# Standardized residuals: observed yield minus the posterior mean fitted
# value, scaled by the posterior sd of the fitted value
res.lin <- (nulli1$YIELD - modr$summary.fitted.values$mean)/modr$summary.fitted.values$sd # standardized residuals of predicted mu-hat values
hist(res.lin, 100)
How about general model diagnostics:
modd$DIC
## [1] 83855.69
modd$CPO
## [1] -41941.34
modd$MSE
## [1] 455.3346
modd$R2
## [1] 0.6335586
The probability integral transform (PIT) is another leave-one-out cross-validation check that evaluates the predictive performance of the model, where a Uniform distribution of PIT values means the predictive distribution matches the data well. In our case, the PIT distribution tends towards uniform, with some extreme values in both directions (consider trimming or inspecting these outliers).
modd$PIT
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The first PPC shows a scatterplot of the posterior means for the predictive distribution and observed values, where points scattered along a line of slope 1 indicates a very good fit.
modd$PPC_PT
The second PPC provides a histogram of the posterior predictive p-value which estimates the probability of predicting a value more extreme than the observed value, where values near 0 and 1 indicate that the model does not adequately account for the tails of the observed distribution. The tails are due to isolated county-year events which seem to correspond with disaster records.
modd$PPC_HIST
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Finally, compute the proportion of variance explained by the structured spatial component:
# p.186, only if scale.model=T
marg.hyper <- inla.hyperpar.sample(100000, modr)
pv <- mean(marg.hyper[,1]/(marg.hyper[,1]+marg.hyper[,2]))
The proportion of spatial variance is 0.1280786, indicating that only a modest share of the variability is explained by spatial structure once the county-level predictors are included.
lrr <- find_bs(modr, nulli1, level = "LRR", th=2)
mdp <- brewer.pal(n = 9, name = "BrBG")
lrr.layer <- list("sp.polygons", as(lrr_shp, "Spatial"), col = "gray", first=F)
spplot(lrr, "DIFF", main = "Difference between regional and county effects", col.regions = mdp, cuts = 8, col = "transparent", sp.layout = lrr.layer)
plot_bsds(lrr)
lrr <- find_bs(modr, nulli1, level = "LRR", th=1.5)
plot_bsds(lrr)
# Define priors
# Log-gamma prior for the iid regional (LRR) precision: shape apar, rate bpar,
# with bpar scaled to the residual variance of a fixed-effects-only reference fit.
apar <- 0.5
lmod <- lm(YIELD ~ GDD + SDD + EFP + EXP + CORN_SUIT + factor(YEAR), nulli)
bpar <- apar*var(residuals(lmod))
lgprior_iid <- list(prec = list(prior="loggamma", param = c(apar,bpar)))
# PC prior for the BYM2 precision: P(sd > 3*sdres) = 0.01, where sdres is the
# residual sd of the reference fit.
sdres <- sd(residuals(lmod))
lgprior_bym <- list(prec = list(prior="pc.prec", param = c(3*sdres, 0.01)))
# PC-prior scale for the RW1 climate smooths: P(sd > u) = 0.01
u <- 0.6
# Specify formula
# NOTE: T/F replaced with TRUE/FALSE (T and F are reassignable in R).
formula <- YIELD ~ 1 + f(ID, model="bym2", # county-level spatial structure
                         graph=hadj,
                         scale.model=TRUE,
                         constr = TRUE,
                         hyper= lgprior_bym) +
  CORN_SUIT +
  f(LRR, model = "iid", hyper = lgprior_iid) + # regional random effect
  # Non-linear climate responses as first-order random walks:
  f(GDD, model = "rw1", scale.model = TRUE, constr = TRUE,
    hyper = list(theta = list(prior = "pc.prec", param = c(u, 0.01)))) +
  f(SDD, model = "rw1", scale.model = TRUE, constr = TRUE,
    hyper = list(theta = list(prior = "pc.prec", param = c(u, 0.01)))) +
  f(TP, model = "rw1", scale.model = TRUE, constr = TRUE,
    hyper = list(theta = list(prior = "pc.prec", param = c(u, 0.01)))) +
  factor(YEAR) # year fixed effects
# Join county attributes onto the null dataset (full outer join, all=TRUE),
# then restrict to counties with ph_corn > 20 (presumably percent harvested
# corn — TODO confirm against BS_data_construction notes).
nulli2 <- merge(nulli, county_sub@data, by = "GEOID", all = TRUE)
nulli2 <- nulli2 %>% filter(ph_corn > 20)
# Fit the BYM2 null model; request fitted values plus DIC and CPO/PIT for
# the diagnostics below. (T replaced with TRUE — T is reassignable in R.)
modr <- inla(formula, data=nulli2, family="gaussian",
             control.predictor = list(compute=TRUE),
             control.compute = list(dic=TRUE, cpo=TRUE))
# d <- model_diagnostics(modr, null)
saveRDS(modr, "./out/null_LRR_cornsub.RDS")
# Rebuild the analysis dataset exactly as above, then reload the cached model
# rather than re-running INLA. (T replaced with TRUE — T is reassignable in R.)
nulli2 <- merge(nulli, county_sub@data, by = "GEOID", all = TRUE)
nulli2 <- nulli2 %>% filter(ph_corn > 20)
modr <- readRDS("./out/null_LRR_cornsub.RDS")
modd <- model_diagnostics(modr, nulli2)
Precision estimates are all small, fixed effects make intuitive sense. Deviance is reasonable as compared to other models.
summary(modr) # posterior summaries: fixed effects, hyperparameters, DIC/CPO status
##
## Call:
## c("inla(formula = formula, family = \"gaussian\", data = nulli2,
## control.compute = list(dic = T, ", " cpo = T), control.predictor =
## list(compute = T))")
## Time used:
## Pre = 13.4, Running = 154, Post = 0.487, Total = 168
## Fixed effects:
## mean sd 0.025quant 0.5quant 0.975quant mode kld
## (Intercept) 72.325 12.080 48.608 72.325 96.023 72.325 0
## CORN_SUIT 75.783 5.064 65.840 75.783 85.717 75.783 0
## factor(YEAR)2002 8.288 1.169 5.992 8.288 10.582 8.288 0
## factor(YEAR)2007 24.337 1.177 22.026 24.337 26.645 24.337 0
## factor(YEAR)2012 21.471 1.240 19.036 21.471 23.903 21.471 0
## factor(YEAR)2017 47.945 1.077 45.831 47.945 50.058 47.945 0
##
## Random effects:
## Name Model
## ID BYM2 model
## LRR IID model
## GDD RW1 model
## SDD RW1 model
## TP RW1 model
##
## Model hyperparameters:
## mean sd 0.025quant 0.5quant
## Precision for the Gaussian observations 0.002 0.000 0.002 0.002
## Precision for ID 0.004 0.000 0.003 0.004
## Phi for ID 0.202 0.054 0.118 0.194
## Precision for LRR 0.001 0.000 0.000 0.000
## Precision for GDD 0.028 0.005 0.019 0.028
## Precision for SDD 0.018 0.006 0.007 0.018
## Precision for TP 0.117 0.029 0.065 0.115
## 0.975quant mode
## Precision for the Gaussian observations 0.002 0.002
## Precision for ID 0.004 0.004
## Phi for ID 0.328 0.176
## Precision for LRR 0.001 0.000
## Precision for GDD 0.041 0.027
## Precision for SDD 0.029 0.017
## Precision for TP 0.179 0.112
##
## Expected number of effective parameters(stdev): 929.28(0.03)
## Number of equivalent replicates : 5.72
##
## Deviance Information Criterion (DIC) ...............: 47229.89
## Deviance Information Criterion (DIC, saturated) ....: 5088.50
## Effective number of parameters .....................: 811.24
##
## Marginal log-Likelihood: -24442.80
## CPO and PIT are computed
##
## Posterior marginals for the linear predictor and
## the fitted values are computed
Assess non-linearities:
# Posterior response curves for each RW1 climate smooth.
# Abbreviations (presumably — confirm against BS_data_construction):
# SDD = stress degree days, GDD = growing degree days, TP = total precipitation.
sdd <- nonlinear_effect(modr, "SDD")
gdd <- nonlinear_effect(modr, "GDD")
tp <- nonlinear_effect(modr, "TP")
sdd
gdd
tp
GDD and SDD reflect known and well-documented relationships. The TP response curve is odd. Next, examine the area-specific residuals (u_i); these are the modr$summary.random$ID[1:n, c(1, 2, 3)] values.
# Map the county-level (ID) random effects from the BYM2 term.
modcty <- spatial_effects(modr, nulli2, level="ID")
modcty
Now for the spatially-structured residuals only:
# Map only the spatially-structured component of the county effects.
modres <- spatial_residuals(modr, nulli2, level="ID")
modres
How about regional (LRR) effects:
# Map the regional (LRR) iid random effects.
modlrr <- spatial_effects(modr, nulli2, level="LRR")
modlrr
Look at residuals:
# Standardized residuals: observed minus posterior mean fitted value, scaled
# by the posterior sd of the fitted value.
# NOTE(review): this divides by the sd of the fitted value, not an estimate of
# the observation-level sd — confirm this is the intended standardization.
res.lin <- (nulli2$YIELD - modr$summary.fitted.values$mean)/modr$summary.fitted.values$sd # summary of predicted mu hat values
hist(res.lin, 100)
How about general model diagnostics:
modd$DIC # Deviance Information Criterion (lower = better fit/complexity trade-off)
## [1] 47229.89
modd$CPO # summed log conditional predictive ordinates (leave-one-out score)
## [1] -23681.92
modd$MSE # mean squared error of fitted vs observed yields
## [1] 287.149
modd$R2 # proportion of observed yield variance captured by the fit
## [1] 0.7098722
The probability integral transform (PIT) is another leave-one-out cross-validation check that evaluates the predictive performance of the model, where a Uniform distribution of PIT values means the predictive distribution matches the data well. In our case, the PIT distribution tends towards uniform, with some extreme values in both directions (consider trimming or inspecting these outliers).
modd$PIT # PIT histogram; approximately uniform indicates good predictive calibration
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The first PPC shows a scatterplot of the posterior means for the predictive distribution and observed values, where points scattered along a line of slope 1 indicates a very good fit.
modd$PPC_PT # posterior predictive check: predictive means vs observed values (slope-1 line = good fit)
The second PPC provides a histogram of the posterior predictive p-value which estimates the probability of predicting a value more extreme than the observed value, where values near 0 and 1 indicate that the model does not adequately account for the tails of the observed distribution. The tails are due to isolated county-year events which seem to correspond with disaster records.
modd$PPC_HIST # posterior predictive p-value histogram; mass near 0/1 flags poorly captured tails
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Finally, compute the proportion of variance explained by the structured spatial component:
# Proportion of variance attributed to the structured spatial component
# (Blangiardo & Cameletti p.186); valid only because scale.model=TRUE above.
marg.hyper <- inla.hyperpar.sample(100000, modr)
# NOTE(review): assumes column 1 holds the spatial precision and column 2 the
# competing precision — confirm against colnames(marg.hyper) for this model.
pv <- mean(marg.hyper[,1]/(marg.hyper[,1]+marg.hyper[,2]))
The proportion of spatial variance is 0.3848099 suggesting that a significant amount of the variability is explained by spatial structure.
# Identify bright/dark spots at the regional (LRR) level, threshold = 2 sd.
lrr <- find_bs(modr, nulli2, level = "LRR", th=2)
# Diverging palette for the regional-vs-county difference map.
mdp <- brewer.pal(n = 9, name = "BrBG")
# Overlay region boundaries; first=FALSE so they draw on top of the choropleth.
# (T/F replaced with TRUE/FALSE — T and F are reassignable in R.)
lrr.layer <- list("sp.polygons", as(lrr_shp, "Spatial"), col = "gray", first=FALSE)
spplot(lrr, "DIFF", main = "Difference between regional and county effects", col.regions = mdp, cuts = 8, col = "transparent", sp.layout = lrr.layer)
plot_bsds(lrr)
# Repeat with a looser threshold (1.5 sd) to capture more candidate spots.
lrr <- find_bs(modr, nulli2, level = "LRR", th=1.5)
plot_bsds(lrr)
We next compare how characteristics change from one community (in our case, bright/dark spots) to another.
Resources: * https://jonlefcheck.net/2012/10/24/nmds-tutorial-in-r/
library(rfUtilities)
set.seed(213)
# Binary response: 1 = bright spot, 0 = otherwise.
nulli$OUT <- ifelse(nulli$BS == "Bright", 1, 0)
# Identify variables with more than 80% missing observations.
na_counts <- colSums(is.na(nulli))
bad_vars <- names(na_counts)[na_counts > 0.80 * nrow(nulli)]
# Drop the mostly-missing variables, then any remaining incomplete rows.
# all_of() makes the external-vector selection explicit (plain -c(bad_vars)
# is ambiguous and deprecated in recent dplyr).
varData <- nulli %>% select(-all_of(bad_vars)) %>% drop_na
# BUG FIX: the response must be row-aligned with the predictor rows. drop_na()
# removes rows, so nulli$OUT no longer matches varData — take OUT from varData.
# Also exclude OUT from the predictor set to avoid response leakage.
varImportance <- rf.modelSel(varData %>% select(-OUT), varData$OUT, imp.scale = "se")
cross_val <- function(formula, data, perc_ho = 0.25) {
  # Hold-out validation for an INLA model: split off perc_ho of the rows,
  # refit on the remainder, and report MSE and R2 of the fitted values.
  # NOTE(review): CPO/PIT from INLA already provide LOOCV-style checks, so
  # revisit whether this explicit hold-out is needed.
  # NOTE(review): the hold-out set `ho` is created but never predicted on —
  # the metrics below are computed on the training rows only. Validating on
  # `ho` would require refitting with NA responses for those rows.
  set.seed(100)
  random_rn <- sample(nrow(data), ceiling(nrow(data) * perc_ho))
  ho <- data[random_rn, ]
  sample <- data[-random_rn, ]
  print("Running model on sample...")
  out <- run_model(formula, sample, save = FALSE)
  print("Sample model run complete.")
  # Attach posterior mean fitted values to the training rows.
  fdf <- cbind(sample, out$summary.fitted.values$mean)
  colnames(fdf)[dim(fdf)[2]] <- c("FittedVals")
  fdf$Error <- fdf$YIELD - fdf$FittedVals
  # BUG FIX: original computed 1/(n * sum(sq_err)) — misplaced parenthesis
  # made this the reciprocal of n*SSE rather than mean squared error.
  MSE <- sum((fdf$YIELD - fdf$FittedVals)^2, na.rm = TRUE) / length(fdf$YIELD)
  # R2 as ratio of predicted to observed sums of squares about the observed mean.
  pred_res2 <- (fdf$FittedVals[!is.na(fdf$YIELD)] - mean(fdf$YIELD, na.rm = TRUE))^2
  obs_res2 <- (fdf$YIELD[!is.na(fdf$YIELD)] - mean(fdf$YIELD, na.rm = TRUE))^2
  R2 <- sum(pred_res2, na.rm = TRUE) / sum(obs_res2, na.rm = TRUE)
  # Return both metrics; the original discarded MSE and returned R2 invisibly.
  list(MSE = MSE, R2 = R2)
}
# Distribution of 2012 yields: inspect normality before choosing a likelihood.
y12 <- null %>% filter(YEAR == 2012)
hist(y12$YIELD, 100)
ggpubr::ggqqplot(y12$YIELD)
# shapiro.test() accepts 3-5000 observations, so subset to at most 5000 rows.
# (Original indexed with 0:5000 — the 0 is silently dropped in R, and indices
# beyond the data length would yield NAs; seq_len with a min() guard is safe.)
shapiro.test(y12$YIELD[seq_len(min(5000, length(y12$YIELD)))])
A significant Shapiro test means that the distribution of the data differs significantly from normal, here through heavy tails: there are more extreme values than would be expected if the data really came from a normal distribution, i.e. the data are more dispersed than a Normal distribution assumes. Consider the non-central t-distribution, which allows for fatter tails (reference needed), or start with the Normal distribution and check whether the model converges.